#############################################################################
#File working with Data for Krause-Lewis JPAM Paper, August, 2025
#Title: "Obtaining Comparable Measures of Agency Performance: An Application 
#to U.S. Federal Agencies, 2002-2024"
#Start Date: October 21, 2023
#Update Date: January 16, 2024
#Update Date: August 8, 2025
#
#Description: This file includes the commands used to create the Figures 2-7 
#that compare estimates of Theta to other data for external validity. Our
#raw data produced a set of estimates in Model 1 in the paper. We merge these
#new estimates with the original dataset and prodcue the figures.
#
#Data: nfy_data_for_correlations_080825.dta
##############################################################################

#First, I'll start with adding some useful packages.
#This first command removes all data in short term memory.
rm(list=ls())

#This command sets the working directory.

#Adding package for adding and joining datasets
install.packages("tidyverse")
library(tidyverse)
library(readxl)

#This package allows me to read in STATA files.
library (haven)
#This package allows the calculation of marginal effects
install.packages("margins")
library(margins)

#Now installing ggplot
install.packages("ggplot2")
install.packages("colorspace")
library(colorspace)
library(ggplot2)

#This package will let me do easy tests of means
install.packages("ggpubr")
install.packages("ggrepel")
install.packages("ggthemes")
library(ggpubr)

#This package will let me export regression tables.
install.packages("stargazer")
library(stargazer)

##

##Figure 1: Agency_perf by Skills
library(readxl)
library(ggrepel)
library(ggpubr)
library(ggplot2)
library(foreign)
library(readxl)
library(haven)
library(ggthemes)

#Here I load up the dataset with the new estimates and variables we will use for validation

theta_est<- read_dta("C:/Users/lewisde/Dropbox/Agency Performance Measurement (David Lewis)/JPAM_R_&R/JPAM Conditional Acceptance Revisions/nfy_data_for_correlations_080825.dta")

#This creates a subset of just CFO Act agencies and excludes years outside the basic window.
df_filtered <- theta_est %>% filter(cfoact == 1 & year >= 2002 & year <= 2024)

##Figure 2: Here is the code to create the descriptive graph for start of each administration

obama1 <-ggplot(df_filtered[which(df_filtered$year==2008&df_filtered$F1MedianM1!=""&df_filtered$bureau==0),], mapping = aes(x=reorder(bureau_acr, F1MedianM1), y=F1MedianM1))+
  geom_point(size=0.4)+
  labs (subtitle="Before Obama Administration (2008)")+
  theme(axis.text=element_text(size =0.2))+
  geom_pointrange(mapping = aes(ymin = F1_25LCL_m1nfy, ymax = F1_975UCL_m1nfy))+
  labs(x="", y="")+ coord_flip()+theme_minimal()+scale_y_continuous(limits=c(-0.75,0.75))

trump1 <-ggplot(df_filtered[which(df_filtered$year==2016&df_filtered$F1MedianM1!=""&df_filtered$bureau==0),], mapping = aes(x=reorder(bureau_acr, F1MedianM1), y=F1MedianM1))+
  geom_point(size=0.4)+
  labs (subtitle="Before Trump Administration (2016)")+
  theme(axis.text=element_text(size =0.2))+
  geom_pointrange(mapping = aes(ymin = F1_25LCL_m1nfy, ymax = F1_975UCL_m1nfy))+
  labs(x="", y="")+ coord_flip()+theme_minimal()+scale_y_continuous(limits=c(-0.75,0.75))

biden1 <-ggplot(df_filtered[which(df_filtered$year==2020&df_filtered$F1MedianM1!=""&df_filtered$bureau==0),], mapping = aes(x=reorder(bureau_acr, F1MedianM1), y=F1MedianM1))+
  geom_point(size=0.4)+
  labs (subtitle="Before Biden Administration (2020)")+
  theme(axis.text=element_text(size =0.2))+
  geom_pointrange(mapping = aes(ymin = F1_25LCL_m1nfy, ymax = F1_975UCL_m1nfy))+
  labs(x="", y="")+ coord_flip()+theme_minimal()+scale_y_continuous(limits=c(-0.75,0.75))

trump2 <-ggplot(df_filtered[which(df_filtered$year==2024&df_filtered$F1MedianM1!=""&df_filtered$bureau==0),], mapping = aes(x=reorder(bureau_acr, F1MedianM1), y=F1MedianM1))+
  geom_point(size=0.4)+
  labs (subtitle="Before Trump Administration (2024)")+
  theme(axis.text=element_text(size =0.2))+
  geom_pointrange(mapping = aes(ymin = F1_25LCL_m1nfy, ymax = F1_975UCL_m1nfy))+
  labs(x="", y="")+ coord_flip()+theme_minimal()+scale_y_continuous(limits=c(-0.75,0.75))

figure2 <- ggarrange(obama1, trump1, biden1, trump2,
                     
                     ncol = 2, nrow = 2, align = "h")

annotate_figure(figure2, bottom =text_grob ("Theta Estimate"), lef =text_grob ("Department or Agency", rot=90))

##Figure 3: Here is the code to create the descriptive graph for start of each administration by subcomponents

dhs_sub <-ggplot(theta_est[which(theta_est$year==2024&theta_est$F1MedianM1!=""&theta_est$OKCODE_parent==11),], mapping = aes(x=reorder(bureau_acr, F1MedianM1), y=F1MedianM1))+
  geom_point(size=0.4)+
  labs (subtitle="DHS Subcomponents (2024)")+
  theme(axis.text=element_text(size =0.2))+
  geom_pointrange(mapping = aes(ymin = F1_25LCL_m1nfy, ymax = F1_975UCL_m1nfy))+
  labs(x="", y="")+ coord_flip()+theme_minimal()+scale_y_continuous(limits=c(-0.75,0.75))

hhs_sub <-ggplot(theta_est[which(theta_est$year==2024&theta_est$F1MedianM1!=""&theta_est$OKCODE_parent==9),], mapping = aes(x=reorder(bureau_acr, F1MedianM1), y=F1MedianM1))+
  geom_point(size=0.4)+
  labs (subtitle="HHS Subcomponents (2024)")+
  theme(axis.text=element_text(size =0.2))+
  geom_pointrange(mapping = aes(ymin = F1_25LCL_m1nfy, ymax = F1_975UCL_m1nfy))+
  labs(x="", y="")+ coord_flip()+theme_minimal()+scale_y_continuous(limits=c(-0.75,0.75))

doj_sub <-ggplot(theta_est[which(theta_est$year==2024&theta_est$F1MedianM1!=""&theta_est$OKCODE_parent==14),], mapping = aes(x=reorder(bureau_acr, F1MedianM1), y=F1MedianM1))+
  geom_point(size=0.4)+
  labs (subtitle="DOJ Subcomponents (2024)")+
  theme(axis.text=element_text(size =0.2))+
  geom_pointrange(mapping = aes(ymin = F1_25LCL_m1nfy, ymax = F1_975UCL_m1nfy))+
  labs(x="", y="")+ coord_flip()+theme_minimal()+scale_y_continuous(limits=c(-0.75,0.75))

dol_sub <-ggplot(theta_est[which(theta_est$year==2024&theta_est$F1MedianM1!=""&theta_est$OKCODE_parent==15),], mapping = aes(x=reorder(bureau_acr, F1MedianM1), y=F1MedianM1))+
  geom_point(size=0.4)+
  labs (subtitle="DOL Subcomponents (2024)")+
  theme(axis.text=element_text(size =0.2))+
  geom_pointrange(mapping = aes(ymin = F1_25LCL_m1nfy, ymax = F1_975UCL_m1nfy))+
  labs(x="", y="")+ coord_flip()+theme_minimal()+scale_y_continuous(limits=c(-0.75,0.75))

figure3 <- ggarrange(dhs_sub, hhs_sub, doj_sub, dol_sub,
                     
                     ncol = 2, nrow = 2, align = "h")

annotate_figure(figure3, bottom =text_grob ("Theta Estimate"), lef =text_grob ("Department or Agency", rot=90))

##Figure 4: Boxplot

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Assuming df is your dataframe


# Filter the data where cfoact=1 and year is between 2000 and 2022
df_filtered <- theta_est %>% filter(cfoact == 1 & year >= 2002 & year <= 2024)


# Calculate median values for each bureau and sort in descending order
medians <- df_filtered %>% group_by(bureau_acr) %>% summarise(median = median(F1MedianM1, na.rm = TRUE)) %>% arrange(median)

# Add a new column to the original dataframe with the ordered factor levels
df_filtered$bureau_acr_ordered <- factor(df_filtered$bureau_acr, levels = medians$bureau_acr)

# Create the boxplot
bp<-ggplot(df_filtered, aes(y = bureau_acr_ordered, x = F1MedianM1)) +
  geom_boxplot() +
  xlim(-0.5, 0.5) +
  theme_minimal() +
  theme(
    title = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )

##Figure 5: Estimates by year.

s <-ggplot(theta_est[which(theta_est$cfoact==1),], mapping=aes(x=year, y=F1MedianM1))
s + geom_line(aes(group=bureau_acr))+facet_wrap(~bureau_acr)+ylab("Estimated Theta-First Dimension")+
  theme_bw()+
  geom_ribbon(aes(ymin=F1_25LCL_m1nfy, ymax=F1_975UCL_m1nfy), alpha=0.1)+
  scale_x_discrete(limits=c(2002,2010,2020))+ 
  scale_y_continuous(limits=c(-0.75, 0.75))

##Figure 6: Figure Comparing Theta Estimates to SFGS Performance Measure from Richardson et al. 2023

w<-ggplot(theta_est[which(theta_est$year==2020),],mapping = aes(x=F1MedianM1 , y= perf_rating_inf)) + 
  geom_text (aes(label=bureau_acr), size=2, position = position_jitter(0.15), check_overlap = T) +
  geom_smooth(method=lm, color='black')+
  labs (title="Agency Expert Rating (2020)")+
  theme(axis.title.x = element_blank())+
  theme(axis.title.y = element_blank())+
  stat_cor(method="pearson", size=3, label.y=2.5)+
  scale_x_continuous(limits = c(-0.75, 0.75))+
  scale_y_continuous(limits=c(-2.5, 2.5))

x<-ggplot(theta_est[which(theta_est$year==2020),], mapping = aes(x=F1MedianM1 , y= agency_perf))+ 
  geom_text (aes(label=bureau_acr), size=2, position = position_jitter(0.15), check_overlap = T) +
  geom_smooth(method=lm, color='black')+
  labs (title="Self-Reported Performance (2020)")+
 
  theme(axis.title.x = element_blank())+
  theme(axis.title.y = element_blank())+
  stat_cor(method="pearson", size=3, label.y=5.5)+
  scale_x_continuous(limits = c(-0.75, 0.75))+
  scale_y_continuous(limits=c(1,5.5))

y<-ggplot(theta_est[which(theta_est$year==2014),],mapping = aes(x=F1MedianM1 , y= core_mission)) + 
  geom_text (aes(label=bureau_acr), size=2, position = position_jitter(0.15), check_overlap = T) +
  geom_smooth(method=lm, color='black')+
  labs (title="Core Mission (2014)")+
  theme(axis.title.x = element_blank())+
  theme(axis.title.y = element_blank())+
  stat_cor(method="pearson", size=3, label.y=5.5)+
  scale_x_continuous(limits = c(-0.75, 0.75))+
  scale_y_continuous(limits=c(1,5.5))

z<-ggplot(theta_est[which(theta_est$part_obs>2),],mapping = aes(x=F1MedianM1 , y= Section4))+ 
  geom_text (aes(label=bureau_acr), size=2, position = position_jitter(0.15), check_overlap = T) +
  geom_smooth(method=lm, color='black')+
  labs (title="PART Results Demonstrated (2002 - 2008)")+
  theme(axis.title.x = element_blank())+
  theme(axis.title.y = element_blank())+
  stat_cor(method="pearson", size=3, label.y=0.9)+
  scale_x_continuous(limits = c(-0.75, 0.75))+
  scale_y_continuous(limits=c(0, 1))

figure6<- ggarrange(w,x,y,z, ncol=2, nrow=2, align="h")
annotate_figure(figure6, bottom =text_grob ("Performance Estimate"), lef =text_grob ("SFGS Performance Questions (2014, 2020)", rot=90))

##Figure 7: Out of Sample Figures -- FEVS COVID Questions vs. First Dimension Medians

b<-ggplot(theta_est, aes(x= F1MedianM1  , y= g_v17_03)) + 
  geom_text (aes(label=bureau_acr), size=2.5, position = position_jitter(0.15), check_overlap = T) +
  geom_smooth(method=lm, color='black')+
  theme(axis.title.y = element_blank(), plot.title = element_text(size=10))+
  ggtitle("Pre_COVID-19 High Quality Work")+
  stat_cor(method="pearson", size=3)+
  xlab("")+
  ylab("")+
  scale_x_continuous(limits = c(-0.75, 0.75))+
  scale_y_continuous(limits=c(4.1,4.8))

c<-ggplot(theta_est, aes(x= F1MedianM1 , y= g_v18_03)) + 
  geom_text (aes(label=bureau_acr), size=2.5, position = position_jitter(0.15), check_overlap = T) +
  geom_smooth(method=lm, color='black')+
  theme(axis.title.y = element_blank(), plot.title = element_text(size=10))+
  ggtitle("COVID-19 High Quality Work")+
  stat_cor(method="pearson", size=3)+
  xlab("")+
  ylab("")+
  scale_x_continuous(limits = c(-0.75, 0.75))+
  scale_y_continuous(limits=c(4.1,4.8))

d<-ggplot(theta_est, aes(x= F1MedianM1  , y= g_v17_06)) + 
  geom_text (aes(label=bureau_acr), size=2.5, position = position_jitter(0.15), check_overlap = T) +
  geom_smooth(method=lm, color='black')+
  theme(axis.title.y = element_blank(), plot.title = element_text(size=10))+
  ggtitle("Pre_COVID-19 Achieved Goals")+
  stat_cor(method="pearson", size=3)+
  xlab("")+
  ylab("")+
  scale_x_continuous(limits = c(-0.75, 0.75))+
  scale_y_continuous(limits=c(4.1,4.8))

e<-ggplot(theta_est, aes(x= F1MedianM1  , y= g_v18_06)) + 
  geom_text (aes(label=bureau_acr), size=2.5, position = position_jitter(0.15), check_overlap = T) +
  geom_smooth(method=lm, color='black')+
  theme(axis.title.y = element_blank(), plot.title = element_text(size=10))+
  ggtitle("COVID-19 Achieved Goals")+
  stat_cor(method="pearson", size=3)+
  xlab("")+
  ylab("")+
  scale_x_continuous(limits = c(-0.75, 0.75))+
  scale_y_continuous(limits=c(4.1,4.8))

figure7 <- ggarrange(b, c, d, e,
                     
                     ncol = 2, nrow = 2, align = "h")

annotate_figure(figure7, bottom =text_grob ("Theta Estimate"), lef =text_grob ("2020 Unique COVID-19 Performance Questions (FEVS)", rot=90))


